mta_data = read_csv (file = "./data/MTA_recent_ridership_data_20201123_0.csv",
col_types = cols(
date = col_date(format = "%mm/%dd/%yy"),
`Subways: % Change From 2019 Equivalent Day` = col_number(),
`Buses: % Change From 2019 Equivalent Day` = col_number(),
`Bridges and Tunnels: % Change From 2019 Equivalent Day` = col_number()
) #only changed the formats of important variables
) %>%
janitor::clean_names()
skimr::skim(mta_data)
Data summary
| Name |
mta_data |
| Number of rows |
268 |
| Number of columns |
13 |
| _______________________ |
|
| Column type frequency: |
|
| character |
4 |
| numeric |
9 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| date |
0 |
1.00 |
6 |
8 |
0 |
268 |
0 |
| lirr_percent_change_from_2019_monthly_weekday_saturday_sunday_average |
31 |
0.88 |
4 |
4 |
0 |
41 |
0 |
| metro_north_percent_change_from_2019_monthly_weekday_saturday_sunday_average |
31 |
0.88 |
4 |
4 |
0 |
39 |
0 |
| access_a_ride_percent_change_from_2019_monthly_weekday_saturday_sunday_average |
0 |
1.00 |
5 |
7 |
0 |
222 |
0 |
Variable type: numeric
| subways_total_estimated_ridership |
0 |
1.00 |
1188754.06 |
907206.69 |
198693.0 |
590202.25 |
1082366.5 |
1606546.00 |
5515945.0 |
▇▅▁▁▁ |
| subways_percent_change_from_2019_equivalent_day |
0 |
1.00 |
-74.53 |
17.80 |
-93.5 |
-87.40 |
-75.2 |
-69.90 |
23.9 |
▇▂▁▁▁ |
| buses_total_estimated_ridership |
0 |
1.00 |
898386.33 |
367033.69 |
279100.0 |
591025.00 |
948100.0 |
1109492.00 |
2244500.0 |
▇▇▇▁▁ |
| buses_percent_change_from_2019_equivalent_day |
0 |
1.00 |
-51.25 |
18.09 |
-84.0 |
-63.50 |
-52.0 |
-43.00 |
40.0 |
▃▇▁▁▁ |
| lirr_total_estimated_ridership |
31 |
0.88 |
49484.81 |
29227.32 |
1900.0 |
22800.00 |
46800.0 |
76600.00 |
92500.0 |
▆▅▅▅▇ |
| metro_north_total_estimated_ridership |
31 |
0.88 |
40194.51 |
19249.81 |
5100.0 |
21200.00 |
43600.0 |
58200.00 |
77300.0 |
▇▅▇▇▅ |
| access_a_ride_total_scheduled_trips |
0 |
1.00 |
14138.28 |
6772.45 |
2506.0 |
8404.50 |
13073.0 |
19894.75 |
34304.0 |
▇▇▇▂▁ |
| bridges_and_tunnels_total_traffic |
0 |
1.00 |
668395.02 |
189440.49 |
177590.0 |
541411.50 |
744720.0 |
815673.75 |
938167.0 |
▁▃▃▆▇ |
| bridges_and_tunnels_percent_change_from_2019_equivalent_day |
0 |
1.00 |
-28.59 |
20.48 |
-80.1 |
-43.02 |
-20.4 |
-13.57 |
27.4 |
▃▃▇▇▁ |
mta_data =
mta_data %>%
subset(select = -c(lirr_total_estimated_ridership, lirr_percent_change_from_2019_monthly_weekday_saturday_sunday_average, metro_north_total_estimated_ridership, metro_north_percent_change_from_2019_monthly_weekday_saturday_sunday_average, access_a_ride_total_scheduled_trips, access_a_ride_percent_change_from_2019_monthly_weekday_saturday_sunday_average, bridges_and_tunnels_total_traffic, bridges_and_tunnels_percent_change_from_2019_equivalent_day))
mta_data =
mta_data %>%
mutate(
'subway_2019' = subways_total_estimated_ridership/(1+(subways_percent_change_from_2019_equivalent_day/100)),
'bus_2019'=
buses_total_estimated_ridership/(1+(buses_percent_change_from_2019_equivalent_day/100))
) %>%
rename(
"subway_2020" = subways_total_estimated_ridership,
"subway_pct_change" = subways_percent_change_from_2019_equivalent_day,
"bus_2020" = buses_total_estimated_ridership,
"bus_pct_change" = buses_percent_change_from_2019_equivalent_day
)
#change date to date format and order by date
plot_mta =
mta_data %>%
mutate(
date= as.Date(date,format = "%m/%d")) %>%
arrange(date)
#create a text_label label
text_label =
plot_mta %>%
mutate(text_label = str_c("percent change from 2019 to 2020: ", subway_pct_change)) %>%
select(date, text_label)
#setting up graph for subway
#pivoting
plot_subway =
plot_mta %>%
select(subway_2020, subway_2019, date) %>%
melt(., id.vars = "date") %>%
#merge based on month
merge(text_label, by = "date")
#setting up graph for bus
#pivoting
plot_bus =
plot_mta %>%
select(bus_2020,bus_2019,date) %>%
melt(., id.vars = "date") %>%
#merge based on month
merge(text_label, by = "date")
#setting up for t-test
#separate by month and day
mta_data =
mta_data %>%
arrange(date) %>%
separate(date, into = c("month", "day", "year"))%>%
mutate(month = as.numeric(month),
day = as.numeric(day)) %>%
select(-c(year)) #drop year column
mta_subway_ridership =
mta_data %>%
group_by(month)%>%
summarize(
avg_subway_2019 = mean(subway_2019),
avg_subway_2020 = mean(subway_2020)
)
mta_2019_sample =
mta_data%>%
select(month, subway_2019) %>%
nest(subway_2019)%>%
mutate("subway_2019_sample" = data)%>%
select(-data)
mta_2020_sample =
mta_data%>%
select(month, subway_2020)%>%
nest(subway_2020)%>%
mutate("subway_2020_sample" = data)%>%
select(-data)
mta_samples =
bind_cols(mta_2019_sample, mta_2020_sample)%>%
select(-month...3)%>%
rename(month = month...1)
mta_t_test =
mta_samples%>%
mutate(t_test = map2(.x = subway_2019_sample, .y = subway_2020_sample, ~t.test(.x , .y) ),
t_test_results = map(t_test, broom::tidy))%>%
select(month, t_test_results)%>%
unnest(t_test_results)%>%
select(month,p.value)%>%
mutate(difference = case_when(
p.value >= 0.05 ~ "insignificant",
p.value < 0.05 ~ "significant"),
p.value = ifelse(
p.value< 0.001,"<0.001",round(p.value, digits = 4))) %>%
arrange(month)
mta_year_ttest =
bind_cols(mta_subway_ridership, mta_t_test)%>%
select(-month...4)%>%
rename(month = month...1)
#create a text_label label
text_label =
mta_year_ttest %>%
mutate(text_label = str_c("p-value: ",p.value, "\nDifference: ", difference)) %>%
select(month, text_label)
#pivoting
plot_ttest =
mta_subway_ridership %>%
rename(
"2020"=avg_subway_2020,
"2019"=avg_subway_2019
) %>%
melt(., id.vars = "month") %>%
#merge based on month
merge(text_label, by = "month")
subway_ridership = lm(subway_2020 ~ month, data = mta_data)
subway_ridership %>%
broom::tidy() %>%
select(term, estimate, p.value) %>%
knitr::kable(digits = 3)
| (Intercept) |
965079.87 |
0.000 |
| month |
32472.74 |
0.139 |
#to plot for regression line
ggplot(mta_data, aes(month, subway_2020)) +
geom_point() +
stat_smooth(method = lm)

- Show a plot of model residuals against fitted values – use add_predictions and add_residuals in making this plot.
mta_data %>%
modelr::add_predictions(subway_ridership) %>%
modelr::add_residuals(subway_ridership) %>%
ggplot(aes(x = pred, y = resid)) + geom_point() +
labs(x = "Predicted value",
y = "Residual")

Next, we need to propose a regression model for bus ridership in 2020. As the outcome (bus ridership) is continuous, we can fit a linear regression model.
bus_ridership = lm(bus_2020 ~ month, data = mta_data)
bus_ridership %>%
broom::tidy()
# A tibble: 2 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 717060. 64077. 11.2 4.46e-24
2 month 26325. 8733. 3.01 2.82e- 3
#Now we need to tidy the output and get only the intercept, slope and p-values
bus_ridership %>%
broom::tidy() %>%
select(term, estimate, p.value) %>%
knitr::kable(digits = 3)
| (Intercept) |
717060.27 |
0.000 |
| month |
26324.69 |
0.003 |
#to plot for regression line
ggplot(mta_data, aes(month, bus_2020)) +
geom_point() +
stat_smooth(method = lm)

Column {data-width=650}
-----------------------------------------------------------------------
### Chart A
<div class="knitr-options" data-fig-width="768" data-fig-height="576"></div>
```r
plot_ttest %>%
plot_ly(
x = ~month, y = ~value, type = "scatter", mode = "lines+markers",
color = ~variable, text = ~text_label) %>%
layout (
title = "Monthly Average Ridership of Subway 2019 vs 2020",
xaxis = list(title ="Months",range=c(3,11)),
yaxis = list(title="Average Ridership"),
legend = list(font = list(size = 10))
)
Column
Chart B
plot_subway %>%
plot_ly(
x = ~date, y = ~value, type = "scatter", mode = "markers",
color = ~variable, text = ~text_label) %>%
layout (
title = "Subway Ridership Trends 2019 - 2020",
xaxis = list(title ="Month/Day", tickformat = "%m/%d"), #drop year
yaxis = list(title="Ridership")) %>%
add_lines(x =as.Date("2020-03-01"), line = list(dash="dot", color = 'red', width=0.5, opacity = 0.5),name = 'First case on 3/1') %>%
add_lines(x =as.Date("2020-04-07"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '100K cases in NYC on 04/07') %>%
add_lines(x =as.Date("2020-05-26"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '200K cases in NYC on 05/26')
Chart C
#plot_bus
plot_bus %>%
plot_ly(
x = ~date, y = ~value, type = "scatter", mode = "markers",
color = ~variable, text = ~text_label) %>%
layout (
title = "Bus Ridership Trends 2019 - 2020",
xaxis = list(title ="Month/Day", tickformat = "%m/%d"),
yaxis = list(title="Ridership")) %>%
add_lines(x =as.Date("2020-03-01"), line = list(dash="dot", color = 'red', width=0.5, opacity = 0.5),name = 'First case on 3/1') %>%
add_lines(x =as.Date("2020-04-07"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '100K cases in NYC on 04/07') %>%
add_lines(x =as.Date("2020-05-26"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '200K cases in NYC on 05/26')
---
title: "MTA ridership 2019 - 20 Dashboard"
output:
flexdashboard::flex_dashboard:
orientation: columns
vertical_layout: fill
source: embed
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(patchwork)
library(readr)
library(broom)
library(dbplyr)
library(viridis)
library(reshape2)
library(plotly)
library(lubridate)
knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
fig.width = 8,
fig.height = 6,
out.width = "90%"
)
options(
ggplot2.continuous.colour = "viridis",
ggplot2.continuous.fill = "viridis"
)
scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d
theme_set(theme_minimal() + theme(legend.position = "bottom"))
```
``` {r importing data}
mta_data = read_csv (file = "./data/MTA_recent_ridership_data_20201123_0.csv",
col_types = cols(
date = col_date(format = "%mm/%dd/%yy"),
`Subways: % Change From 2019 Equivalent Day` = col_number(),
`Buses: % Change From 2019 Equivalent Day` = col_number(),
`Bridges and Tunnels: % Change From 2019 Equivalent Day` = col_number()
) #only changed the formats of important variables
) %>%
janitor::clean_names()
skimr::skim(mta_data)
mta_data =
mta_data %>%
subset(select = -c(lirr_total_estimated_ridership, lirr_percent_change_from_2019_monthly_weekday_saturday_sunday_average, metro_north_total_estimated_ridership, metro_north_percent_change_from_2019_monthly_weekday_saturday_sunday_average, access_a_ride_total_scheduled_trips, access_a_ride_percent_change_from_2019_monthly_weekday_saturday_sunday_average, bridges_and_tunnels_total_traffic, bridges_and_tunnels_percent_change_from_2019_equivalent_day))
```
``` {r calculate 2019 ridership bus subway}
mta_data =
mta_data %>%
mutate(
'subway_2019' = subways_total_estimated_ridership/(1+(subways_percent_change_from_2019_equivalent_day/100)),
'bus_2019'=
buses_total_estimated_ridership/(1+(buses_percent_change_from_2019_equivalent_day/100))
) %>%
rename(
"subway_2020" = subways_total_estimated_ridership,
"subway_pct_change" = subways_percent_change_from_2019_equivalent_day,
"bus_2020" = buses_total_estimated_ridership,
"bus_pct_change" = buses_percent_change_from_2019_equivalent_day
)
```
```{r plotting general trends of MTA ridership 2019 & 2020}
#change date to date format and order by date
plot_mta =
mta_data %>%
mutate(
date= as.Date(date,format = "%m/%d")) %>%
arrange(date)
#create a text_label label
text_label =
plot_mta %>%
mutate(text_label = str_c("percent change from 2019 to 2020: ", subway_pct_change)) %>%
select(date, text_label)
#setting up graph for subway
#pivoting
plot_subway =
plot_mta %>%
select(subway_2020, subway_2019, date) %>%
melt(., id.vars = "date") %>%
#merge based on month
merge(text_label, by = "date")
#setting up graph for bus
#pivoting
plot_bus =
plot_mta %>%
select(bus_2020,bus_2019,date) %>%
melt(., id.vars = "date") %>%
#merge based on month
merge(text_label, by = "date")
```
```{r}
#setting up for t-test
#separate by month and day
mta_data =
mta_data %>%
arrange(date) %>%
separate(date, into = c("month", "day", "year"))%>%
mutate(month = as.numeric(month),
day = as.numeric(day)) %>%
select(-c(year)) #drop year column
mta_subway_ridership =
mta_data %>%
group_by(month)%>%
summarize(
avg_subway_2019 = mean(subway_2019),
avg_subway_2020 = mean(subway_2020)
)
mta_2019_sample =
mta_data%>%
select(month, subway_2019) %>%
nest(subway_2019)%>%
mutate("subway_2019_sample" = data)%>%
select(-data)
mta_2020_sample =
mta_data%>%
select(month, subway_2020)%>%
nest(subway_2020)%>%
mutate("subway_2020_sample" = data)%>%
select(-data)
mta_samples =
bind_cols(mta_2019_sample, mta_2020_sample)%>%
select(-month...3)%>%
rename(month = month...1)
```
```{r t test}
mta_t_test =
mta_samples%>%
mutate(t_test = map2(.x = subway_2019_sample, .y = subway_2020_sample, ~t.test(.x , .y) ),
t_test_results = map(t_test, broom::tidy))%>%
select(month, t_test_results)%>%
unnest(t_test_results)%>%
select(month,p.value)%>%
mutate(difference = case_when(
p.value >= 0.05 ~ "insignificant",
p.value < 0.05 ~ "significant"),
p.value = ifelse(
p.value< 0.001,"<0.001",round(p.value, digits = 4))) %>%
arrange(month)
mta_year_ttest =
bind_cols(mta_subway_ridership, mta_t_test)%>%
select(-month...4)%>%
rename(month = month...1)
```
``` {r plotting set up}
#create a text_label label
text_label =
mta_year_ttest %>%
mutate(text_label = str_c("p-value: ",p.value, "\nDifference: ", difference)) %>%
select(month, text_label)
#pivoting
plot_ttest =
mta_subway_ridership %>%
rename(
"2020"=avg_subway_2020,
"2019"=avg_subway_2019
) %>%
melt(., id.vars = "month") %>%
#merge based on month
merge(text_label, by = "month")
```
```{r regression models for subway 2020}
subway_ridership = lm(subway_2020 ~ month, data = mta_data)
subway_ridership %>%
broom::tidy() %>%
select(term, estimate, p.value) %>%
knitr::kable(digits = 3)
#to plot for regression line
ggplot(mta_data, aes(month, subway_2020)) +
geom_point() +
stat_smooth(method = lm)
```
* Show a plot of model residuals against fitted values – use add_predictions and add_residuals in making this plot.
```{r}
mta_data %>%
modelr::add_predictions(subway_ridership) %>%
modelr::add_residuals(subway_ridership) %>%
ggplot(aes(x = pred, y = resid)) + geom_point() +
labs(x = "Predicted value",
y = "Residual")
```
Next, we need to propose a regression model for bus ridership in 2020.
As the outcome (bus ridership) is continuous, we can fit a linear regression model.
```{r}
bus_ridership = lm(bus_2020 ~ month, data = mta_data)
bus_ridership %>%
broom::tidy()
#Now we need to tidy the output and get only the intercept, slope and p-values
bus_ridership %>%
broom::tidy() %>%
select(term, estimate, p.value) %>%
knitr::kable(digits = 3)
#to plot for regression line
ggplot(mta_data, aes(month, bus_2020)) +
geom_point() +
stat_smooth(method = lm)
```
```
Column {data-width=650}
-----------------------------------------------------------------------
### Chart A
```{r}
plot_ttest %>%
plot_ly(
x = ~month, y = ~value, type = "scatter", mode = "lines+markers",
color = ~variable, text = ~text_label) %>%
layout (
title = "Monthly Average Ridership of Subway 2019 vs 2020",
xaxis = list(title ="Months",range=c(3,11)),
yaxis = list(title="Average Ridership"),
legend = list(font = list(size = 10))
)
```
Column {data-width=350}
-----------------------------------------------------------------------
### Chart B
```{r}
plot_subway %>%
plot_ly(
x = ~date, y = ~value, type = "scatter", mode = "markers",
color = ~variable, text = ~text_label) %>%
layout (
title = "Subway Ridership Trends 2019 - 2020",
xaxis = list(title ="Month/Day", tickformat = "%m/%d"), #drop year
yaxis = list(title="Ridership")) %>%
add_lines(x =as.Date("2020-03-01"), line = list(dash="dot", color = 'red', width=0.5, opacity = 0.5),name = 'First case on 3/1') %>%
add_lines(x =as.Date("2020-04-07"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '100K cases in NYC on 04/07') %>%
add_lines(x =as.Date("2020-05-26"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '200K cases in NYC on 05/26')
```
### Chart C
```{r}
#plot_bus
plot_bus %>%
plot_ly(
x = ~date, y = ~value, type = "scatter", mode = "markers",
color = ~variable, text = ~text_label) %>%
layout (
title = "Bus Ridership Trends 2019 - 2020",
xaxis = list(title ="Month/Day", tickformat = "%m/%d"),
yaxis = list(title="Ridership")) %>%
add_lines(x =as.Date("2020-03-01"), line = list(dash="dot", color = 'red', width=0.5, opacity = 0.5),name = 'First case on 3/1') %>%
add_lines(x =as.Date("2020-04-07"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '100K cases in NYC on 04/07') %>%
add_lines(x =as.Date("2020-05-26"), line = list(dash="dot", color = 'red', width=0.5, alpha = 0.5),name = '200K cases in NYC on 05/26')
```